##架構圖
#Loading multiple .csv files as separate data frames
getwd()
folder <- "data/"
file_list <- list.files(path = folder, pattern = "*.csv")#Read in each .csv file
for (i in 1:length(file_list)){
assign(file_list[i],
read.csv(paste(folder,file_list[i],sep=''),stringsAsFactors = F)
)}#Rename data
geo <- olist_geolocation_dataset.csv
orders <- olist_orders_dataset.csv
cust <- olist_customers_dataset.csv
sellers <- olist_sellers_dataset.csv
products <- olist_products_dataset.csv
orderitems <- olist_order_items_dataset.csv
payments <- olist_order_payments_dataset.csv
nametrans <- product_category_name_translation.csv
reviews <- olist_order_reviews_dataset.csv
closed <- olist_closed_deals_dataset.csv
marketing <- olist_marketing_qualified_leads_dataset.csv# 地理資料整理
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)
selllocation<-geo %>% group_by(geolocation_city) %>% summarise(selllat = max(geolocation_lat),selllng=max(geolocation_lng))
custlocation<-geo %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))# 時間資料處理
orders$order_approved_at<-as.Date(orders$order_approved_at,format="%Y-%m-%d %H:%M:%S")
orders$order_purchase_timestamp<-as.Date(orders$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_carrier_date<-as.Date(orders$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_customer_date<-as.Date(orders$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
orders$order_estimated_delivery_date<-as.Date(orders$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
orderitems$shipping_limit_date<-as.Date(orderitems$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")# 把各個資料合併
M_1 <- merge(orderitems,sellers,by.x="seller_id",by.y="seller_id")
M_2 <- merge(orders,cust,by.x="customer_id",by.y="customer_id")
M_3 <- merge(M_2,M_1,by="order_id")
M_4 <- merge(M_3,products,by="product_id")
M_5 <- merge(M_4,payments,by="order_id")
M_6 <- merge(M_5,selllocation,by.x="seller_city",by.y="geolocation_city")
M_7 <- merge(M_6,custlocation,by.x="customer_city",by.y="geolocation_city")
colnames(nametrans) <- c("product_category_name","product_category_name_english")#計算買賣家之間的距離
dist_list <- list()
for (i in 1:nrow(M_7)) {
dist_list[[i]] <- gdist(lon.1 = M_7$selllng[i],
lat.1 = M_7$selllat[i],
lon.2 = M_7$custlng[i],
lat.2 = M_7$custlat[i],
units="miles")
}
M_7$distbtwn<-as.integer(dist_list)
M_8<-merge(M_7,nametrans,by="product_category_name")💡1-1.訂單數量
ts = as.POSIXct(as.character(olist_orders_dataset.csv$order_purchase_timestamp) , format="%Y-%m-%d %T")
ts.bym <- cut(ts, breaks = "month")
dfts <- data.frame(ts,ts.bym)
dim(dfts) #共有99441筆訂單[1] 99441 2
ggplot(dfts, aes(ts.bym))+
geom_bar(fill="lightblue")+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))+
ylab("Count") +
xlab("Month") +
theme(panel.background = element_rect(fill = "#f1f1f1")) #最多銷售月份為2017-11-1ts.byH <- format(ts,format="%H") %>% data.frame()
ggplot(ts.byH ,aes(.))+
geom_bar(fill="lightblue")+
ylab("Count") +
xlab("Time") +
theme(panel.background = element_rect(fill = "#f1f1f1")) #中午到傍晚時段為下單尖峰##按照小時區分
DD.byh <- format(DD,format="%H")
D$byh <- DD.byh
#Rearranging data from D data frame
deaf = select(D, product_category_name_english,byh)
shook = group_by(deaf, byh, product_category_name_english) %>% summarise(n=n())
jump = spread(shook, byh, n)
jump[is.na(jump)] <- 0
#Converting product category column into rowname
jump = column_to_rownames(jump, loc = "product_category_name_english")
jump = as.matrix(jump)
#Using plot_ly () to draw the interactive heatmap
plot_ly(x=colnames(jump), y=rownames(jump), z = jump, type = "heatmap")💡1-2.賣家數量及地理分布
[1] 2962
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=selllng,y=selllat,color=seller_state),size=0.2)💡1-3.買家數量及地理分布
[1] 92458
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=custlng,y=custlat,color=customer_state),size=0.2)Q <- group_by(olist_customers_dataset.csv, customer_unique_id) %>% summarise(nid=n())
dim(Q) #共有96096名顧客[1] 96096 2
1 2 3 4 5 6 7 9 17
0.969 0.029 0.002 0.000 0.000 0.000 0.000 0.000 0.000
housewares computers_accessories furniture_decor
6527 7275 7513
sports_leisure health_beauty bed_bath_table
7954 9103 10001
health_beauty <- filter(M_14_4, product_category_name_english=="health_beauty")
ggplot(health_beauty, aes(x=customer_state, y=price,col="grey")) +
geom_point(color="orange", size=4) +
geom_segment( aes(x=customer_state, xend=customer_state, y=0, yend=price))+theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank()
) 💡1-4.產品類別銷售及地理區域市佔情形
ggplot() +
geom_bar(data= M_8,aes(product_category_name_english,fill=seller_state),width=1)+
coord_flip()+
theme(axis.text.y = element_text( size=5))+
ylab("商品類別") +
xlab("數量") + 互動式圓餅圖 : 不同商品類別的平均評分及銷量佔比 + 商品種類後面的數字是其平均分數
Product2 <- group_by(M_15_3,business_segment)%>%summarize(score2=round(mean(review_score),2),percent=n()/nrow(M_15_3)*100)
plot_ly(Product2, labels = paste(Product2$business_segment,Product2$score2),values = Product2$percent, type = 'pie') %>%
layout(title = '各商品類別銷售百分比',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))💡2-1.每月的訂單數,新進的買、賣家數趨勢變化
marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
M_9 <- merge(M_8,marketingdata,by = "seller_id",all.x = T)M_9[,9]=as.POSIXct(M_9[,9], format="%Y-%m-%d")
M_9$time_group <- format(M_9$order_purchase_timestamp,"%Y%m") %>% as.numeric()
M_9$time <- format(M_9$order_purchase_timestamp,"%Y%m") %>% paste0(.,"28") %>% as.Date("%Y%m%d")
time_group <- unique(M_9$time_group) %>% sort()
num_seller_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$seller_id[M_9$time_group==time_group[i]],unique(M_9$seller_id[M_9$time_group<time_group[i]])) %>% length)
num_seller = c(0,num_seller_1[-24])
num_customer_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$customer_unique_id[M_9$time_group==time_group[i]],unique(M_9$customer_unique_id[M_9$time_group<time_group[i]])) %>% length)
num_customer = c(0,num_customer_1[-24])
num_order <- M_9 %>% group_by(time) %>% summarise(
num_order = length(unique(order_id))
)
plot <- cbind(time=(sort(unique(M_9$time))),num_seller,num_customer,num_order[,2])data.ts<-zoo(plot,plot[,"time"])
plot <- data.ts[1:24,-1]
dygraph(plot,main = "Olist新進買賣家及訂單數走勢") %>%
dySeries("num_customer", label = "新進顧客數")%>%
dySeries("num_order", label = "訂單數") %>%
dySeries("num_seller", axis = 'y2', label = "新進賣家數") %>%
dyOptions( axisLineColor="orange",
gridLineColor="indianred" , fillGraph = F,fillAlpha = 0.2,
drawGrid = TRUE,drawPoints=TRUE, pointSize = 1 ) %>%
dyAxis("x", label = " 日期 ", drawGrid = F) %>%
dyAxis("y", label = " ", drawGrid = T) %>%
dyHighlight(highlightCircleSize = 3,
highlightSeriesBackgroundAlpha = 0.2) %>%
dyOptions(colors = RColorBrewer::brewer.pal(3, 'Dark2')) %>%
dyRangeSelector(height = 1)<!–hid="htmlwidget-000cc18a38dd705c5405" style="width:672px;height:480px;" class="dygraphs html-widget">